Text analysis: title and abstract of male and female speakers

Abstracts

# Load the presentation records, parse the day-month-year `date` strings and
# derive the calendar year of each presentation.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# NOTE(review): read.table() treats both ' and " as quote characters and "#" as
# a comment marker by default; confirm the free-text columns parse as intended.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.

# Ids of the entries to discard; every other row is kept.
IDs <- c(154, 250, 211, 289, 230, 167)
data <- filter(data, !(id %in% IDs))

Using abstracts in English (original or translated)

# Keep only the rows that have an English abstract.
data <- filter(data, !is.na(abstract_english))

Number of abstracts per group

# Number of abstracts per gender.
table(data[["gender"]])
## 
##   F   M 
##  99 138
# Number of abstracts per position category (rows) and gender (columns).
table(data[["position_cat"]], data[["gender"]])
##            
##              F  M
##   others     4  1
##   postdoc   21 21
##   professor 21 60
##   student   52 56

Tidytext

# One row per word token of "title + abstract", keeping the talk metadata.
# A missing title is treated as empty text: paste() would otherwise convert NA
# to the literal string "NA" and add a spurious token to that abstract.
text_tok <- data %>%
  dplyr::select(id, gender, position_cat, audience_n,
                abstract_english, title_english) %>%
  mutate(text = paste(ifelse(is.na(title_english), "", title_english),
                      abstract_english)) %>%
  unnest_tokens(output = word, input = text)

# English stop-word list as a one-column tibble so it can be anti-joined.
stop_w <- tibble(word = stopwords("en"))

# remove stopwords (the result is kept sorted alphabetically by word)
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  arrange(word)

# remove numbers and other characters
# Tokens are dropped by pattern instead of by row position: the original
# removed rows 1:290 of the sorted table, which is only right for one exact
# version of the data and silently deletes real words if the input changes.
# NOTE(review): assumes those 290 rows were exactly the tokens that start with
# a non-letter -- confirm the token counts are unchanged after this edit.
text <- text %>%
  filter(!grepl("^[^[:alpha:]]", word)) %>% # leading digit or symbol
  filter(nchar(word) != 1) %>%              # letters alone
  # remove acronyms, symbols; the Greek tokens (delta-13C, beta) are written
  # as Unicode escapes so the match does not depend on this file's encoding
  filter(!word %in% c("mpas", "\u03b413c", "\u03b2"))


# solving some simple plurals
# Every word listed here is reduced to its singular by dropping the final "s".
plural <- c("actions", "advances", "adaptations", "amphibians", "animals",
            "ants", "anurans", "abundances", "adjustments", "adults",
            "affects", "applications", "approaches", "bees", "builds",
            "birds", "palms", "cerrados", "challenges", "outputs", "queens",
            "techniques", "continents", "crops", "consequences", "questions",
            "decisions", "declines", "determines", "determinants", "defenses",
            "dynamics", "economics", "ecosystems", "environments",
            "experiences", "forests", "grasslands", "genetics", "gifts",
            "gradients", "guides", "impacts", "increases", "interactions",
            "lives", "landscapes", "males", "mammals", "mangroves", "models",
            "movements", "mutualisms", "networks", "neotropics", "opilions",
            "phenotypes", "plants", "projects", "paths", "perspectives",
            "populations", "promotes", "relationships", "relations",
            "resources", "responses", "roads", "services", "skulls", "snakes",
            "seeds", "spaces", "spiders", "stages", "trees", "variations",
            "threats")

text$word <- ifelse(text$word %in% plural,
                    substr(text$word, 1, nchar(text$word) - 1),
                    text$word)

# "approaches" is an "-es" plural: dropping one character leaves "approache",
# which would be counted separately from "approach". Finish the reduction.
text$word[text$word %in% "approache"] <- "approach"
  • Grouping similar words:
# Hand-made lemma table: each row is c(variant, replacement). rbind() stacks
# the pairs into a two-column character matrix (column 1 = word as it appears
# in the text, column 2 = form it is replaced by).
# NOTE(review): some variants look misspelled ("intregating", "occuring",
# "studing", "heterogeneties"); presumably they mirror spellings found in the
# abstracts -- confirm before correcting them.
lemma <- rbind(c("adaptive", "adaptation"),
               c("abilities","ability"),
               c("advancement", "advance"),
                  c("absent","absence"),
               c("agricultural", "agriculture"),
               c("agro", "agriculture" ),
               c("amazonia","amazon" ),
               c("amazonian","amazon" ),
               c("andean","andes"),
               c("apply","application"),
               c("applying","application"),
               c("apidae","apis"),
               c("arachnida","arachnid"),
               c("argue","argument"),
               c("basal", "basis"),
               c("behavioral","behavior"),
               c("behavioural","behavior"),
               c("bignonieae", "bignoniaceae"),
               c("biological", "biology"),
               c("brazilian","brazil"),
               c("building","build"),
               c("changing", "change"),
               c("cnidarian", "cnidaria"),
               c("coastal","coast"),
               c("colour", "color"),
               c("colors", "color"),
               c("communities","community" ),
               c("competitive", "competition"),
               c("complexity", "complex"),
               c("convergences", "convergence"),
               c("convergent", "convergence"),
               c("cordatus","cordata" ),
               c("croplands","crop"),
               c( "cultural", "culture"),
               c("darwin's", "darwin"),
               c("darwinian", "darwin"),
               c("defensive", "defense"),
               c("dependent","dependence"),
               c("detecting","detection"),
               c("determine", "determinant"),
               c("developmental", "development"),
               c("dispersers","dispersal"),
               c("disturbed", "disturbance"),
               c("diversification", "diversity"),
               c("dragonflies", "dragonfly"),
               c("drier", "drought"),
               c("ecological", "ecology"),
               c("ecologists", "ecology"),
               c("endemic", "endemism"),
               c("effectiveness", "efficiency"),
               c("environmental", "environment"),
               c("evolutionary", "evolution"),
               c("expanding", "expansion"),
               c("extinct", "extinction"),
               c("facilitate", "facilitation"),
               c("fisheries", "fishery"),
               c("floral", "flora"),
               c("floristic", "flora"),
               c("forested", "forest"),
               c("functional", "function"),
               c("functionally", "function"),
               c("functioning", "function"),
               c("frequencies", "frequency"),
               c("frequently", "frequency"),
               c("frequent", "frequency"),
               c("geographical", "geographic"),
               c("heterogeneties", "heterogeneity"),
               c("heterogeneous", "heterogeneity"),
               c("histories", "history"),
               c("integrated", "integration"),
               c("intregating", "integration"),
               c("integrative", "integration"),
               c("invasive", "invasion"),
               c("isotopic", "isotope"),
               c("linking", "link"),
               c("living", "live"),
               c("mammalia", "mammal"),
               c("managed", "manage"),
               c("managers", "manage"),
               c("mathematical", "mathematics"),
               c("mates", "mating"),
               c("mediated", "mediate"),
               c("mechanistic", "mechanism"),
               c("matrices", "matrix"),
               c("migratory", "migration"),
               c("mimicking", "mimicry"),
               c("modeling", "model"),
               c("mutualistic", "mutualism"),
               c("natural", "nature"),
               c("neotropical", "neotropic"),
               c("northeastern", "northeast"),
               c("occuring", "occur"),
               c("onça", "onca"),
               c("opiliones", "opilion"),
               c("parasite", "parasitism"),
               c("parent", "parenting"),
               c("phylogenies", "phylogeny"),
               c("phylogenetic", "phylogeny"),
               c("phylogenomic", "phylogeny"),
               c("pollinators", "pollination"),
               c("protected", "protect"),
               c("protective", "protect"),
               c("rainfall", "rain"),
               c("reconstructing", "reconstruction"),
               c("regulatory", "regulation"),
               c("regulates", "regulation"),
               c("relation", "relationship"),
               c("reproductive", "reproduction"),
               c("restored", "restoration"),
               c("robustness", "robust"),
               c("scientific", "science"),
               c("scientist", "science"),
               c("sexy", "sexual"),
               c("simulated", "simulation"),
               c("societies", "society"),
               c("social", "society"),
               c("socio", "society"),
               c("space", "spatial"),
               c("spacio", "spatial"),
               c("stabilize", "stability"),
               c("stable", "stability"),
               c("stories", "story"),
               c("strategic", "strategy"),
               c("strategies", "strategy"),
               c("structured", "structure"),
               c("structuring", "structure"),
               c("studies", "study"),
               c("studing", "study"),
               c("sustainable", "sustainability"),
               c("theories", "theory"),
               c("theoretical", "theory"),
               c("threatened", "threat"),
               c("tropical", "tropic"),
               c("vision", "visual")
               )
# Two-column lookup: column 1 = variant found in the text, column 2 = the form
# it is replaced by. stringsAsFactors = FALSE keeps both columns character on
# every R version; a factor replacement value can end up written as its
# integer code instead of the word.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# Apply the replacements row by row. seq_len() makes the loop a no-op for an
# empty table, where 1:0 would iterate over c(1, 0).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}

WORDS - all data

# Number of retained word tokens per gender.
table(text[["gender"]])
## 
##     F     M 
## 10558 13483
# Number of retained word tokens per position category and gender.
table(text[["position_cat"]], text[["gender"]])
##            
##                F    M
##   others     262  139
##   postdoc   2792 2494
##   professor 2062 5319
##   student   5351 5531

Mean number of words by title+abstract

# Distribution of the number of retained words per talk (title + abstract),
# by gender. The point layer is part of the chain: previously the chain ended
# at ylab(), so geom_quasirandom() was evaluated on its own and never drawn.
# The axis label typo ("abtract") is corrected as well.
text %>%
  count(id, gender) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21) +
  ylab("Number of words in title + abstract")
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_quasirandom

20 most common words

# Most frequent words, most frequent first. Words tied at rank 20 are all
# kept, so the table can have more than 20 rows.
kable(
  filter(
    count(text, word, sort = TRUE),
    min_rank(desc(n)) <= 20
  )
)
word n
species 384
ecology 185
forest 174
model 157
study 157
environment 139
evolution 134
can 129
landscape 127
population 124
diversity 112
nature 102
community 100
male 97
plant 97
different 95
patterns 88
present 86
areas 84
animal 82
interaction 82

Word cloud

# Word cloud of every retained token.
textplot_wordcloud(x = dfm(tokens(text$word)))

# Clouds per gender (first call = female tokens, second = male tokens).
# The previous graphics layout is saved and restored so later base plots are
# not affected by mfrow.
old_par <- par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   col = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next plot not to start a new frame;
# confirm this is wanted together with mfrow = c(1, 2).
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   col = "#FCA532")
par(old_par)

Word frequencies by gender

# Relative frequency of every word within each gender, reshaped to one row per
# word with female/male proportions and counts side by side, and ranked by the
# absolute difference between the two proportions.
# Words used by only one gender have NA in the other gender's columns.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>% # grouping is only needed for the per-gender totals
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))

# Label the 20 top-ranked words; safe when the table has fewer than 20 rows.
props$label <- ifelse(seq_len(nrow(props)) <= 20, props$word, NA_character_)
# Male vs. female word proportions on log-log axes. The dashed diagonal marks
# equal use; colour encodes the absolute difference and the labelled words are
# the ones flagged in props$label. (A stray empty argument in aes() was
# removed.)
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size = 2.5, alpha = 0.5) +
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

# geom_smooth(method = "lm")
# ggsave() without `plot =` writes the most recently displayed ggplot.
ggsave("figures/abstract_wordFreq.jpg", height = 5, width = 7)

Words that are close to the dashed line have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.

Correlation of word-frequency use between genders:

# Correlation between female and male word proportions. cor.test() uses only
# complete pairs, so words with an NA proportion for one gender are left out.
cor.test(props$proportion_F, props$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  props$proportion_F and props$proportion_M
## t = 71.272, df = 1648, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8565945 0.8802780
## sample estimates:
##       cor 
## 0.8689328

The proportions are highly correlated: female and male speakers tend to use the main words at similar frequencies.

20 words with the largest differences in frequency

# Long-format table of the labelled words for the mirrored bar chart.
# Female proportions are negated so their bars extend to the left of zero, and
# words are ordered by their total count. The two proportion columns are
# selected by name: positions 2:3 only worked while the column order stayed
# exactly as pivot_wider() produced it.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Mirrored bar chart of the labelled words: one bar per gender and word, with
# dotted guides at the axis breaks. Tick labels show magnitudes on both sides
# of zero.
ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  labs(x = "Proportion", y = "") +
  scale_fill_manual(name = "gender", labels = c("F", "M"),
                    values = c("#6D57CF", "#FCA532")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             col = "darkgray", linetype = "dotted") +
  scale_x_continuous(labels = c(0.02, 0.01, 0, 0.01, 0.02),
                     breaks = c(-0.02, -0.01, 0, 0.01, 0.02))

ggsave("figures/abstract_wordFreq_barplot.jpeg",
       width = 7, height = 7, units = "in", dpi = 300)

TF IDF

# tf-idf of each word, treating each gender as one document; highest first.
text_id <- count(text, gender, word)
text_id <- bind_tf_idf(text_id, word, gender, n)
text_id <- arrange(text_id, desc(tf_idf))

10 “exclusive” words for each group

# Top tf-idf words per gender (ties at rank 10 are kept by top_n()).
# tf_idf is mapped to x and the word to y, so the "tf-idf" title belongs on
# the x axis; the original put it on the word axis. The grouping is dropped
# before plotting.
text_id$word <- as.factor(text_id$word)
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

WORDS - professors only data

# Tokens from talks given by professors only.
textP <- filter(text, position_cat == "professor")

# Number of retained word tokens per gender, professors only.
table(textP[["gender"]])
## 
##    F    M 
## 2062 5319

Mean number of words by abstract

# Distribution of the number of retained words per talk, professors only:
# violin, narrow boxplot and one point per talk.
ggplot(count(textP, id, gender), aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)

20 most common words

# Most frequent words in professors' talks, most frequent first. Words tied at
# rank 20 are all kept.
kable(
  filter(
    count(textP, word, sort = TRUE),
    min_rank(desc(n)) <= 20
  )
)
word n
species 90
ecology 68
environment 52
evolution 52
population 52
nature 44
plant 43
study 42
model 41
can 39
ecosystem 38
diversity 35
society 33
water 32
pollination 30
research 30
biology 29
interaction 29
science 29
present 26

Words Frequency by gender

# Relative frequency of every word within each gender (professors only),
# reshaped to one row per word and ranked by the absolute difference between
# the female and male proportions.
# Words used by only one gender have NA in the other gender's columns.
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>% # grouping is only needed for the per-gender totals
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))

# Label the 20 top-ranked words; safe when the table has fewer than 20 rows.
propsP$label <- ifelse(seq_len(nrow(propsP)) <= 20, propsP$word,
                       NA_character_)
# Male vs. female word proportions for professors, on log-log axes restricted
# to the same limits on both sides. Dashed diagonal = equal use; colour = the
# absolute difference; labelled words are those flagged in propsP$label.
ggplot(propsP, aes(x = proportion_M, y = proportion_F,
                   color = abs.dif.p)) +
  geom_abline(lty = 2, color = "gray40") +
  # geom_point(size = 2.5, alpha = 0.3) +
  geom_jitter(alpha = 0.3, size = 2.5) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words",
                limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words",
                limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", high = "red", low = "blue",
                       labels = percent_format()) +
  theme(legend.position = c(1, 0), legend.justification = c(1, -0.1))

# geom_smooth(method = "lm")
ggsave("figures/abstract_wordFreq_Prof.jpg", width = 7, height = 5)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females.

Labels for the 20 words with largest differences in frequency.

Correlation of word-frequency use between genders:

# Correlation between female and male word proportions, professors only.
# cor.test() uses only complete pairs, so words with an NA proportion for one
# gender are left out.
cor.test(propsP$proportion_F, propsP$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  propsP$proportion_F and propsP$proportion_M
## t = 21.015, df = 560, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6150815 0.7078441
## sample estimates:
##     cor 
## 0.66401

20 words with the largest differences in frequency

# Long-format table of the labelled words (professors) for the mirrored bar
# chart. Female proportions are negated so their bars extend to the left of
# zero, and words are ordered by their total count. The proportion columns are
# selected by name instead of by position (2:3), which depended on the exact
# column order.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Mirrored bar chart of the labelled words for professors: one bar per gender
# and word, with dotted guides at the axis breaks. Tick labels show magnitudes
# on both sides of zero.
ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  labs(x = "Proportion", y = "") +
  scale_fill_manual(name = "gender", labels = c("F", "M"),
                    values = c("#6D57CF", "#FCA532")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             col = "darkgray", linetype = "dotted") +
  scale_x_continuous(labels = c(0.02, 0.01, 0, 0.01, 0.02),
                     breaks = c(-0.02, -0.01, 0, 0.01, 0.02))

ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg",
       width = 7, height = 7, units = "in", dpi = 300)

TF IDF

# tf-idf of each word in professors' talks, treating each gender as one
# document; highest first.
text_idP <- count(textP, gender, word)
text_idP <- bind_tf_idf(text_idP, word, gender, n)
text_idP <- arrange(text_idP, desc(tf_idf))

10 “exclusive” words for each group

# Top tf-idf words per gender, professors only (ties at rank 10 are kept by
# top_n()). tf_idf is mapped to x and the word to y, so the "tf-idf" title
# belongs on the x axis; the original put it on the word axis. The grouping is
# dropped before plotting.
text_idP$word <- as.factor(text_idP$word)
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

Topic model - all data

# Document-term matrix with one document per talk. Document names are
# "<id>_<gender>", so the gender can be read back from the last character.
# select() is namespace-qualified, as in the tokenisation step of this file,
# so another attached package exporting select() cannot be picked up instead.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

Choosing number of topics: comparing AIC

# Fit LDA topic models with 2 to 5 topics (same seed for each) and compare
# them by AIC. The object names are kept because AICtab() uses them as row
# labels. TRUE is spelled out because T is a reassignable variable.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
ap_lda5 <- LDA(matext, k = 5, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, ap_lda5, base = TRUE)
##         AIC      dAIC     df   
## ap_lda2 371013.1      0.0 9655 
## ap_lda3 373602.8   2589.7 14482
## ap_lda4 377235.0   6221.9 19309
## ap_lda5 382315.5  11302.3 24136

two-topics model seems the most plausible model

Word-topic probabilities

10 words with the largest probabilities for each group

# Per-topic word probabilities (beta) of the two-topic model, and a bar chart
# of the highest-beta terms of each topic (ties at rank 10 are kept).
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- arrange(
  ungroup(top_n(group_by(ap_topics, topic), 10, beta)),
  topic, desc(beta)
)
ggplot(mutate(ap_top_terms, term = reorder(term, beta)),
       aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities - classifying the abstracts

and comparing the two groups by gender (if there is a difference in frequency)

# Per-document topic probabilities (gamma); each document is assigned to its
# most probable topic. The gender is the last character of the document name.
# slice_max(with_ties = FALSE) returns exactly one row per document: top_n()
# keeps ties, so a document with equal gammas would be counted in both topics
# in the tables and tests that follow. The grouping is dropped afterwards.
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()

# Number of documents per gender (rows) and assigned topic (columns).
table(classifi[["gender"]], classifi[["topic"]])
##    
##      1  2
##   F 39 60
##   M 63 74
# Gender-by-topic table as row percentages (no decimals) with the counts in
# parentheses.
kable(
  adorn_ns(
    adorn_pct_formatting(
      adorn_percentages(tabyl(classifi, gender, topic)),
      digits = 0
    )
  )
)
gender 1 2
F 39% (39) 61% (60)
M 46% (63) 54% (74)
# Gamma of the assigned topic, by topic, in one panel per gender.
# (disabled) mutate(title = reorder(title, gamma * topic))
ggplot(classifi, aes(as.character(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ gender)

Chi-square test

# Test of independence between speaker gender and assigned topic.
chisq.test(classifi$gender, classifi$topic)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  classifi$gender and classifi$topic
## X-squared = 0.76661, df = 1, p-value = 0.3813

Topic model - Professors only

# Professors only: document-term matrix (document names "<id>_<gender>"), LDA
# fits with 2 to 4 topics (same seed) and their AIC comparison.
# select() is namespace-qualified, as in the tokenisation step of this file,
# and TRUE is spelled out because T is a reassignable variable.
matextP <- textP %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)
ap_lda2P <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3P <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4P <- LDA(matextP, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P, base = TRUE)
##          AIC      dAIC     df   
## ap_lda2P 111913.7      0.0 5017 
## ap_lda3P 113740.0   1826.3 7525 
## ap_lda4P 116048.9   4135.2 10033

word-topic probabilities

# Per-topic word probabilities (beta) of the professors' two-topic model, and
# a bar chart of the highest-beta terms of each topic (ties at rank 10 kept).
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")
ap_top_termsP <- arrange(
  ungroup(top_n(group_by(ap_topicsP, topic), 10, beta)),
  topic, desc(beta)
)
ggplot(mutate(ap_top_termsP, term = reorder(term, beta)),
       aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Per-document topic probabilities (gamma) for professors; each document is
# assigned to its most probable topic. The gender is the last character of the
# document name.
# slice_max(with_ties = FALSE) returns exactly one row per document: top_n()
# keeps ties, so a document with equal gammas would be counted in both topics.
# The grouping is dropped afterwards.
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")
classifiP <- ap_documentsP %>%
  mutate(gender = substr(document, nchar(document), nchar(document))) %>%
  group_by(document, gender) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()

# Number of professors' documents per gender (rows) and topic (columns).
table(classifiP[["gender"]], classifiP[["topic"]])
##    
##      1  2
##   F 15  6
##   M 27 32
# NOTE(review): tabyl() is already called earlier in this file, so janitor is
# presumably attached before this point; consider loading it once at the top.
library(janitor)
# Gender-by-topic table (professors) as row percentages with counts.
kable(
  adorn_ns(
    adorn_pct_formatting(
      adorn_percentages(tabyl(classifiP, gender, topic)),
      digits = 0
    )
  )
)
gender 1 2
F 71% (15) 29% (6)
M 46% (27) 54% (32)
# Gamma of the assigned topic, by topic, in one panel per gender (professors).
# The violin is drawn first and a narrow boxplot on top of it, as in the other
# plots of this file; in the original order the violin covered the boxplot.
classifiP %>%
  # mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ gender)

Chi-square test

# Test of independence between gender and assigned topic, professors only.
chisq.test(classifiP$gender, classifiP$topic)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  classifiP$gender and classifiP$topic
## X-squared = 3.1266, df = 1, p-value = 0.07702

Sentiment analysis

Chapter 2, Silge & Robinson. 2018

  • The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
# Preview of the NRC lexicon (one row per word-sentiment pair).
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
  • The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
# Preview of the Bing lexicon (word and its positive/negative label).
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
  • The AFINN lexicon assigns words a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
# Preview of the AFINN lexicon (word and its numeric score in `value`).
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows

PENSAR: tem que levar em conta o número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres, né? Ou não?

Score words difference in female and male abstracts

All data

# AFINN scores attached to the per-talk word counts; words that have no AFINN
# score are dropped by the inner join.
affword <- get_sentiments("afinn")

affc <- inner_join(
  count(text, id, gender, word, sort = TRUE),
  affword,
  by = "word"
)

Calculating the mean of the scores for each abstract (weighted by the number of times each word appears), by gender:

# Per-talk AFINN score: plain mean over distinct scored words and mean
# weighted by how often each word occurs; the weighted score is plotted by
# gender. `.groups = "drop"` returns an ungrouped table instead of one still
# grouped by id, and geom_quasirandom() is namespace-qualified like the other
# uses in this file, so it works without ggbeeswarm attached.
affc %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            .groups = "drop") %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

Professors

# AFINN scores attached to the per-talk word counts of professors; words that
# have no AFINN score are dropped by the inner join.
affword <- get_sentiments("afinn")

affcP <- inner_join(
  count(textP, id, gender, word, sort = TRUE),
  affword,
  by = "word"
)

Calculating the mean of the scores for each abstract (weighted by the number of times each word appears), by gender:

# Per-talk AFINN score for professors: plain mean over distinct scored words
# and mean weighted by word frequency; the weighted score is plotted by
# gender. `.groups = "drop"` returns an ungrouped table, and
# geom_quasirandom() is namespace-qualified like the other uses in this file.
affcP %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            .groups = "drop") %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas para a linguagem científica.

Precisa saber como ponderar pelo total de palavras.

All data

# Number of NRC-tagged word occurrences per talk, gender and sentiment.
# A word with several NRC categories contributes to each of them; talks with
# no word in a category have no row for it (they are not counted as zero).
# `.groups = "drop"` returns an ungrouped table, and geom_quasirandom() is
# namespace-qualified like the other uses in this file.
nrcword <- get_sentiments("nrc")

nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

Professors

# Same NRC counts for professors only. This overwrites `nrc` from the all-data
# section. `.groups = "drop"` returns an ungrouped table, and
# geom_quasirandom() is namespace-qualified like the other uses in this file.
nrcword <- get_sentiments("nrc")

nrc <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

# All sentiment categories, one panel each.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

# Only the "positive" category, with a boxplot overlay.
nrc %>%
  filter(sentiment == "positive") %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Positive words")

Frequency of sentiment words per abstract

All data

# Number of Bing positive/negative word occurrences per talk and gender.
# Talks with no word of a given polarity have no row for it.
# `.groups = "drop"` returns an ungrouped table, and geom_quasirandom() is
# namespace-qualified like the other uses in this file.
bingword <- get_sentiments("bing")

bing <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

Professors

# Same Bing counts for professors only. This overwrites `bing` from the
# all-data section. `.groups = "drop"` returns an ungrouped table, and
# geom_quasirandom() is namespace-qualified like the other uses in this file.
bingword <- get_sentiments("bing")

bing <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom()